src/Oligodendroglia/[RC17]GO_for_Oligodendroglia.Rmd · MonolayerCultures

---
title: '[RC17]GO for Oligodendroglia'
author: "Nina-Lydia Kazakou"
date: "15/03/2022"
output: html_document
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Set-up

### Load libraries

```{r message=FALSE, warning=FALSE}
library(SingleCellExperiment)
library(Seurat)
library(tidyverse)
library(Matrix)
library(scales)
library(cowplot)
library(RCurl)
library(ggplot2)
library(edgeR)
library(dplyr)
library(ggsci)  
library(here)
library(clusterProfiler)
library(biomaRt)
library(ReactomePA)
library(org.Hs.eg.db)
library(enrichplot)
library(RColorBrewer)
library(plotrix)
```

### Colour Palette

```{r load_palette}
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3<-pal_lancet("lanonc", alpha = 0.7)(9)
mypal4<-pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)
mycoloursP<- c(mypal, mypal2, mypal3, mypal4)
```

### Set up for distinctive colours
```{r}
plot_colour <- grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)]
```

### Load object

```{r}
oligos <- readRDS(here("Data", "Processed", "Oligodendroglia", "hES-derived_monolayerOL.rds"))
```

```{r}
DimPlot(oligos, group.by = "ClusterID", pt.size = 0.9, label = TRUE, cols = mycoloursP[6:40]) & NoLegend()
```

### Create Directory
```{r eval=FALSE}
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis"))
```


# GO Alanlysis

I use three types of data for GO analysis:
* Top100 Deferentially Expressed Genes among clusters with p < 0.05 & avg_log2FC > 0.5 (These genes will be taken from the analysis I've done in the past)
* Top100 genes from raw counts 
* Top Deferentially Expressed Genes among clusters, filtered here for p < 0.05 & avg_log2FC > 0.4.

For the first category I will have GO data both from ClusterProfiler & ClueGO. 
For the second category I will have GO data only from ClueGO. 
For the third category I will have GO data only from ClusterProfiler.

I have already prepared the Top100 Deferentially Expressed Gene lists for each cluster manually. 


### Prepare raw_expr data:

When performing gene ontology on deferentially expressed genes, only the differences between clusters will be flagged, but not their similarities. Therefore, additionally to the gene ontology analysis on deferentially expressed genes, I will perform it on the top 100 genes (based on their raw expression level).

Here, I will prepare .csv files containing the raw counts for each cluster.
```{r eval=FALSE}
Idents(oligos) <- "ClusterID"

cluster_lev <- levels(oligos@meta.data$ClusterID)

for(i in 1 : length(cluster_lev)){
  clu_dat <- subset(oligos, ident = cluster_lev[i])
  expr <- data.frame(mean_exp = rowMeans(clu_dat@assays$RNA@counts), 
                     cluster = cluster_lev[i])
  expr <- expr[order(expr$mean_exp, decreasing = TRUE),, drop = FALSE] 
  file_n <- paste(cluster_lev[i], "av_raw_expr.csv", sep = "_")
  write.csv(expr, here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Avg_raw_expr", file_n))
}
```

I have used the Top100 avg_raw_expr genes from each cluster and run them through ClueGO. The results have been ploted using GraphPad Prism. 


## DE Genes with avg_log2FC > 0.4 & p_val_adj < 0.05 (filter here)

### Read-in Cluster Marker Lists:
```{r}
oligo_file_path <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Input_Data", "DE_Genes", "Separate_Cluster_Markers")

oligo_files <- list.files(oligo_file_path)
```


### Prepare marts to convert to entrez
```{r}
listMarts()
ensembl = useMart("ensembl", dataset="hsapiens_gene_ensembl",
                      host = "www.ensembl.org")
    
    
listDatasets(ensembl)
attributes = listAttributes(ensembl)
named_list <- list()
for(i in 1: length(oligo_files)){
  data <- read.csv(here(oligo_file_path, oligo_files[i]))
  cluster <- data[["cluster"]][1]
  fil_data <- subset(data, data$avg_log2FC > 0.4 & data$p_val_adj < 0.05)
  
  genes <- fil_data$gene
  
  # convert to ENTREZID
  
  genes_ent <- bitr(genes, fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)
  
  named_list[[i]] <- genes_ent$ENTREZID
  names(named_list)[i] <- cluster
}
str(named_list)
ck_go <- compareCluster(geneCluster = named_list, fun = enrichGO, 
                          OrgDb = org.Hs.eg.db)
ck_go <- setReadable(ck_go, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
  
```

```{r fig.height=22, fig.width=15}
ck_kegg <- compareCluster(geneCluster = named_list, fun = enrichKEGG)
ck_kegg <- setReadable(ck_kegg, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
head(ck_kegg) 
dotplot(ck_kegg)
```

```{r fig.height=22, fig.width=15, eval=FALSE}
dp <- dotplot(ck_go) +
  theme(axis.text.x = element_text(angle = 90), axis.text.x.bottom = element_text(size = 15))
  
pdf(file = here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Plots" "Oligos.pdf"),   
              width = 15, 
              height = 22) 
print(dp)
dev.off()
```


```{r eval=FALSE, message=FALSE, warning=FALSE}
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Cluster_Plots"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "EMAP_Plots"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Molecular_Functions"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Biological_Process"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Symbol"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Tree"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Upset_Plots"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "HeatMaps"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Molecular_Functions_Tree"))

go_dir <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Cluster_Plots")
emap_dir <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "EMAP_Plots")
go_dir_mol_f <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Molecular_Functions")
go_dot <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Biological_Process")
go_dir_cnetpl <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Symbol")
go_dir_tree <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Tree")
go_dir_upset_plot <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Upset_Plots")
go_dir_hm <-here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "HeatMaps")
go_dir_mol_f_tree <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "Molecular_Functions_Tree")

for(i in 1: length(oligo_files)){
  data <- read.csv(here(oligo_file_path, oligo_files[i]))
  cluster <- data[["cluster"]][1]
  plot_label <- paste0(cluster, ".pdf")
  
  # filter data
  fil_data <- subset(data, data$p_val_adj < 0.05 &
                       avg_log2FC > 0.4)
  
  # Prepare input data
  ranks <- fil_data %>% dplyr::select(gene, avg_log2FC)
  ranks <- deframe(ranks)
  #RUN GO using clusterProfiler
    fil_data$Gene <- fil_data$gene
    #remove any duplicates (sanity check for me)
    genemodulesGO <- fil_data[!duplicated(fil_data$gene),]
    
    Biomart_gencode_ensembl84_biotypes <- getBM(attributes=c("ensembl_gene_id",
                                                             "entrezgene_id",
                                                             "gene_biotype",
                                                             "hgnc_symbol"), 
                                                filters = "hgnc_symbol", 
                                                values = fil_data$Gene, 
                                                mart = ensembl,
                         uniqueRows = FALSE)
    
    
    Biomart_gencode_ensembl84_biotypes[, 'gene_biotype'] <- 
      as.factor(Biomart_gencode_ensembl84_biotypes[,'gene_biotype'])
    
    #Filter genes
    #biotype_all_dataset <- subset(Biomart_gencode_ensembl84_biotypes, hgnc_symbol 
    #                              %in% oligos@assays$RNA@var.features)
    #entrezID <-  subset(biotype_all_dataset, biotype_all_dataset$hgnc_symbol
    #                    %in% oligos@assays$RNA@var.features)
   # if(nrow(biotype_all_dataset) == 0){
      biotype_all_dataset <- subset(Biomart_gencode_ensembl84_biotypes, hgnc_symbol 
                                  %in% rownames(oligos@assays$RNA))
      
      entrezID <-  subset(biotype_all_dataset, biotype_all_dataset$hgnc_symbol
                        %in% rownames(oligos@assays$RNA))
   # }
    
    entrezmatched <- entrezID[match(genemodulesGO$Gene,entrezID$hgnc_symbol),]
  
    #you might need to remove NAs
    entrezmatched <- entrezmatched[! apply(entrezmatched[,c(1,3)], 1,function (x) anyNA(x)),]
    allLLIDs <- entrezmatched$entrezgene
    
    modulesReactome <- enrichPathway(gene=as.character(allLLIDs),organism="human",
                                        pvalueCutoff=0.01,
                                        qvalueCutoff = 0.3,
                                        pAdjustMethod = "none", 
                                        readable=T)
    
    
  
    
    if(nrow(as.data.frame(modulesReactome)) > 0){
        dot_plot <- dotplot(modulesReactome, showCategory=20)
        
        pdf(file = here(go_dir, plot_label),   
          width = 12, 
          height = 12) 
        print(dot_plot)
        dev.off()
      
      
        x2 <- pairwise_termsim(modulesReactome) 
        emap_plot <- emapplot(x2)
        
        pdf(file = here(emap_dir, plot_label),   
          width = 12, 
          height = 12) 
        
        print(emap_plot)
        
        dev.off()
        
     #Alternative GO clasissification
        
         ent_uni <- rownames(oligos@assays$RNA)
         ent_uni_entrez <- bitr(ent_uni, fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)
         
         ent_uni <- ent_uni_entrez$ENTREZID
         
         # Molecular function
         
          go_mf <- enrichGO(gene  = as.character(allLLIDs),
                universe      = ent_uni,
                OrgDb         = org.Hs.eg.db,
                ont           = "MF",
                pAdjustMethod = "BH",
                pvalueCutoff  = 1,
                qvalueCutoff  = 1,
                readable      = TRUE)
         
          
          go_mf_plot <- dotplot(go_mf, showCategory = 20)
          pdf(file = here(go_dir_mol_f, plot_label),   
                 width = 12, 
                height = 10) 
          print(go_mf_plot)
          dev.off()
          
        # GO biological process
        
        ego <- enrichGO(gene  = as.character(allLLIDs),
                OrgDb = org.Hs.eg.db,
                ont = "BP",
                universe = ent_uni )
                
        #head(ego)
        if(nrow(ego) > 0){
        dp2 <- dotplot(ego, showCategory=20)
    
    
        pdf(file = here(go_dot, plot_label),   
            width = 12, 
            height = 10) 
        print(dp2)
        dev.off()
        
        ## convert gene ID to Symbol
        edox <- setReadable(ego, 'org.Hs.eg.db', 'ENTREZID')
        cnet_pl <- cnetplot(edox, foldChange=ranks, circular = TRUE, 
                       colorEdge = TRUE)
    
    
        pdf(file = here(go_dir_cnetpl, plot_label),   
              width = 14, 
              height = 12) 
        print(cnet_pl)
        dev.off()
        
        
        
        edox2 <- enrichplot::pairwise_termsim(edox)
       
        
        if(nrow(edox2) > 1){
        tree_plot <- treeplot(edox2)
        
        pdf(file = here(go_dir_tree, plot_label),   # The directory you want to save the file in
              width = 16, # The width of the plot in inches
              height = 10) # The height of the plot in inches
        print(tree_plot)
        dev.off()
        }
        
       u_plot <- upsetplot(ego)
       
       pdf(file = here(go_dir_upset_plot, plot_label),   
              width = 15, 
              height = 10) 
        print(u_plot)
        dev.off()
       
        if(nrow(edox) > 1){
            hm <- heatplot(edox, foldChange=ranks, showCategory=5)
       
            pdf(file = here(go_dir_hm, plot_label),   # The directory you want to save the file in
              width = 15, # The width of the plot in inches
              height = 10) # The height of the plot in inches
            print(hm)
            dev.off()
        }
        }
     
        
        
        edox_mf <- setReadable(go_mf, 'org.Hs.eg.db', 'ENTREZID')
        edox2_mf <- pairwise_termsim(edox_mf)
        tree_plot_mf <- treeplot(edox2_mf)
        
        pdf(file = here(go_dir_mol_f_tree, plot_label),   
              width = 16, 
              height = 10) 
        print(tree_plot_mf)
        dev.off()
        
        }
}
```

### Create and save files that contain GO output for oligos:
```{r eval=FALSE, message=FALSE, warning=FALSE}
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "data"))
for(i in 1: length(oligo_files)){
  data <- read.csv(here(oligo_file_path, oligo_files[i]))
  cluster <- data[["cluster"]][1]
  plot_label <- paste0("Oligo_", cluster, ".pdf")
  
  # filter data
  fil_data <- subset(data, data$p_val_adj < 0.05 &
                       avg_log2FC > 0.4)
  #RUN GO using clusterProfiler
    fil_data$Gene <- fil_data$gene
    #remove any duplicates (sanity check for me)
    genemodulesGO <- fil_data[!duplicated(fil_data$Gene),]
    
    Biomart_gencode_ensembl84_biotypes <- getBM(attributes=c("ensembl_gene_id",
                                                             "entrezgene_id",
                                                             "gene_biotype",
                                                             "hgnc_symbol"), 
                                                filters = "hgnc_symbol", 
                                                values = fil_data$Gene, 
                                                mart = ensembl,
                         uniqueRows = FALSE)
    
    
    Biomart_gencode_ensembl84_biotypes[, 'gene_biotype'] <- 
      as.factor(Biomart_gencode_ensembl84_biotypes[,'gene_biotype'])
    
    biotype_all_dataset <- subset(Biomart_gencode_ensembl84_biotypes, hgnc_symbol 
                                  %in% rownames(oligos@assays$RNA))
      
    entrezID <-  subset(biotype_all_dataset, biotype_all_dataset$hgnc_symbol
                        %in% rownames(oligos@assays$RNA))
  
    
    entrezmatched <- entrezID[match(genemodulesGO$Gene,entrezID$hgnc_symbol),]
  
    #you might need to remove NAs
    entrezmatched <- entrezmatched[! apply(entrezmatched[,c(1,3)], 1,function (x) anyNA(x)),]
    allLLIDs <- entrezmatched$entrezgene
    ent_uni <- rownames(oligos@assays$RNA)
    ent_uni_entrez <- bitr(ent_uni, fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)
         
    ent_uni <- ent_uni_entrez$ENTREZID
         
         
        
   ego <- enrichGO(gene  = as.character(allLLIDs),
                OrgDb = org.Hs.eg.db,
                ont = "BP",
                universe = ent_uni )
   
   write.csv(ego, here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_avg_log2FC-04", "data",
                       paste0("Oligo_", cluster, ".csv")))
                
}
```



## Top100 DE Genes, manually filtered for avg_logFC > 0.5 & p_val_adj < 0.05 from original analysis 

### Read-in Cluster Marker Lists:
```{r}
oligo_file_path_orig <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Input_Data", "DE_Genes", "DE_Genes_orig_analysis")

oligo_files_orig <- list.files(oligo_file_path_orig)
```


### Prepare marts to convert to entrez
```{r}
listMarts()
ensembl = useMart("ensembl", dataset="hsapiens_gene_ensembl",
                      host = "www.ensembl.org")
    
    
listDatasets(ensembl)
attributes = listAttributes(ensembl)
named_list <- list()
for(i in 1: length(oligo_files_orig)){
  data <- read.csv(here(oligo_file_path, oligo_files_orig[i]))
  cluster <- data[["cluster"]][1]
  
  
  genes <- data$gene
  
  # convert to ENTREZID
  
  genes_ent <- bitr(genes, fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)
  
  named_list[[i]] <- genes_ent$ENTREZID
  names(named_list)[i] <- cluster
}
str(named_list)
ck_go <- compareCluster(geneCluster = named_list, fun = enrichGO, 
                          OrgDb = org.Hs.eg.db)
ck_go <- setReadable(ck_go, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
  
```

```{r fig.height=22, fig.width=15}
ck_kegg <- compareCluster(geneCluster = named_list, fun = enrichKEGG)
ck_kegg <- setReadable(ck_kegg, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
head(ck_kegg) 
dotplot(ck_kegg)
```

```{r fig.height=22, fig.width=15, eval=FALSE}
dp <- dotplot(ck_kegg) +
  theme(axis.text.x = element_text(angle = 90), axis.text.x.bottom = element_text(size = 15))
  
pdf(file = here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Plots", "Oligos_orig_analysis.pdf"),   
              width = 15, 
              height = 22) 
print(dp)
dev.off()
```


```{r eval=FALSE}
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Cluster_Plots"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "EMAP_Plots"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Molecular_Functions"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Biological_Process"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Symbol"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Tree"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Upset_Plots"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "HeatMaps"))
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Molecular_Functions_Tree"))

go_dir <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Cluster_Plots")
emap_dir <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "EMAP_Plots")
go_dir_mol_f <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Molecular_Functions")
go_dot <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Biological_Process")
go_dir_cnetpl <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Symbol")
go_dir_tree <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Tree")
go_dir_upset_plot <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Upset_Plots")
go_dir_hm <-here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "HeatMaps")
go_dir_mol_f_tree <- here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "Molecular_Functions_Tree")

for(i in 1: length(oligo_files_orig)){
  data <- read.csv(here(oligo_file_path_orig, oligo_files_orig[i]))
  cluster <- data[["cluster"]][1]
  plot_label <- paste0(cluster, ".pdf")
  
  
  # Prepare input data
  ranks <- data %>% dplyr::select(gene, avg_log2FC)
  ranks <- deframe(ranks)
  #RUN GO using clusterProfiler
    data$Gene <- data$gene
    #remove any duplicates (sanity check for me)
    genemodulesGO <- data[!duplicated(data$gene),]
    
    Biomart_gencode_ensembl84_biotypes <- getBM(attributes=c("ensembl_gene_id",
                                                             "entrezgene_id",
                                                             "gene_biotype",
                                                             "hgnc_symbol"), 
                                                filters = "hgnc_symbol", 
                                                values = data$gene, 
                                                mart = ensembl,
                         uniqueRows = FALSE)
    
    
    Biomart_gencode_ensembl84_biotypes[, 'gene_biotype'] <- 
      as.factor(Biomart_gencode_ensembl84_biotypes[,'gene_biotype'])
    
    #Filter genes
    #biotype_all_dataset <- subset(Biomart_gencode_ensembl84_biotypes, hgnc_symbol 
    #                              %in% oligos@assays$RNA@var.features)
    #entrezID <-  subset(biotype_all_dataset, biotype_all_dataset$hgnc_symbol
    #                    %in% oligos@assays$RNA@var.features)
   # if(nrow(biotype_all_dataset) == 0){
      biotype_all_dataset <- subset(Biomart_gencode_ensembl84_biotypes, hgnc_symbol 
                                  %in% rownames(oligos@assays$RNA))
      
      entrezID <-  subset(biotype_all_dataset, biotype_all_dataset$hgnc_symbol
                        %in% rownames(oligos@assays$RNA))
   # }
    
    entrezmatched <- entrezID[match(genemodulesGO$Gene,entrezID$hgnc_symbol),]
  
    #you might need to remove NAs
    entrezmatched <- entrezmatched[! apply(entrezmatched[,c(1,3)], 1,function (x) anyNA(x)),]
    allLLIDs <- entrezmatched$entrezgene
    
    modulesReactome <- enrichPathway(gene=as.character(allLLIDs),organism="human",
                                        pvalueCutoff=0.01,
                                        qvalueCutoff = 0.3,
                                        pAdjustMethod = "none", 
                                        readable=T)
    
    
  
    
    if(nrow(as.data.frame(modulesReactome)) > 0){
        dot_plot <- dotplot(modulesReactome, showCategory=20)
        
        pdf(file = here(go_dir, plot_label),   
          width = 12, 
          height = 12) 
        print(dot_plot)
        dev.off()
      
      
        x2 <- pairwise_termsim(modulesReactome) 
        emap_plot <- emapplot(x2)
        
        pdf(file = here(emap_dir, plot_label),   
          width = 12, 
          height = 12) 
        
        print(emap_plot)
        
        dev.off()
        
     #Alternative GO clasissification
        
         ent_uni <- rownames(oligos@assays$RNA)
         ent_uni_entrez <- bitr(ent_uni, fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)
         
         ent_uni <- ent_uni_entrez$ENTREZID
         
         # Molecular function
         
          go_mf <- enrichGO(gene  = as.character(allLLIDs),
                universe      = ent_uni,
                OrgDb         = org.Hs.eg.db,
                ont           = "MF",
                pAdjustMethod = "BH",
                pvalueCutoff  = 1,
                qvalueCutoff  = 1,
                readable      = TRUE)
         
          
          go_mf_plot <- dotplot(go_mf, showCategory = 20)
          pdf(file = here(go_dir_mol_f, plot_label),   
                 width = 12, 
                height = 10) 
          print(go_mf_plot)
          dev.off()
          
        # GO biological process
        
        ego <- enrichGO(gene  = as.character(allLLIDs),
                OrgDb = org.Hs.eg.db,
                ont = "BP",
                universe = ent_uni )
                
        #head(ego)
        if(nrow(ego) > 0){
        dp2 <- dotplot(ego, showCategory=20)
    
    
        pdf(file = here(go_dot, plot_label),   
            width = 12, 
            height = 10) 
        print(dp2)
        dev.off()
        
        ## convert gene ID to Symbol
        edox <- setReadable(ego, 'org.Hs.eg.db', 'ENTREZID')
        cnet_pl <- cnetplot(edox, foldChange=ranks, circular = TRUE, 
                       colorEdge = TRUE)
    
    
        pdf(file = here(go_dir_cnetpl, plot_label),   
              width = 14, 
              height = 12) 
        print(cnet_pl)
        dev.off()
        
        
        
        edox2 <- enrichplot::pairwise_termsim(edox)
       
        
        if(nrow(edox2) > 1){
        tree_plot <- treeplot(edox2)
        
        pdf(file = here(go_dir_tree, plot_label),   # The directory you want to save the file in
              width = 16, # The width of the plot in inches
              height = 10) # The height of the plot in inches
        print(tree_plot)
        dev.off()
        }
        
       u_plot <- upsetplot(ego)
       
       pdf(file = here(go_dir_upset_plot, plot_label),   
              width = 15, 
              height = 10) 
        print(u_plot)
        dev.off()
       
        if(nrow(edox) > 1){
            hm <- heatplot(edox, foldChange=ranks, showCategory=5)
       
            pdf(file = here(go_dir_hm, plot_label),   # The directory you want to save the file in
              width = 15, # The width of the plot in inches
              height = 10) # The height of the plot in inches
            print(hm)
            dev.off()
        }
        }
     
        
        
        edox_mf <- setReadable(go_mf, 'org.Hs.eg.db', 'ENTREZID')
        edox2_mf <- pairwise_termsim(edox_mf)
        tree_plot_mf <- treeplot(edox2_mf)
        
        pdf(file = here(go_dir_mol_f_tree, plot_label),   
              width = 16, 
              height = 10) 
        print(tree_plot_mf)
        dev.off()
        
        }
}
```


### Create and save files that contain GO output for oligos:
```{r eval=FALSE, message=FALSE, warning=FALSE}
dir.create(here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "data"))
for(i in 1: length(oligo_files)){
  data <- read.csv(here(oligo_file_path_orig, oligo_files_orig[i]))
  cluster <- data[["cluster"]][1]
  plot_label <- paste0("Oligo_", cluster, ".pdf")
  
 
  #RUN GO using clusterProfiler
    data$Gene <- data$gene
    #remove any duplicates (sanity check for me)
    genemodulesGO <- fil_data[!duplicated(fil_data$Gene),]
    
    Biomart_gencode_ensembl84_biotypes <- getBM(attributes=c("ensembl_gene_id",
                                                             "entrezgene_id",
                                                             "gene_biotype",
                                                             "hgnc_symbol"), 
                                                filters = "hgnc_symbol", 
                                                values = fil_data$Gene, 
                                                mart = ensembl,
                         uniqueRows = FALSE)
    
    
    Biomart_gencode_ensembl84_biotypes[, 'gene_biotype'] <- 
      as.factor(Biomart_gencode_ensembl84_biotypes[,'gene_biotype'])
    
    biotype_all_dataset <- subset(Biomart_gencode_ensembl84_biotypes, hgnc_symbol 
                                  %in% rownames(oligos@assays$RNA))
      
    entrezID <-  subset(biotype_all_dataset, biotype_all_dataset$hgnc_symbol
                        %in% rownames(oligos@assays$RNA))
  
    
    entrezmatched <- entrezID[match(genemodulesGO$Gene,entrezID$hgnc_symbol),]
  
    #you might need to remove NAs
    entrezmatched <- entrezmatched[! apply(entrezmatched[,c(1,3)], 1,function (x) anyNA(x)),]
    allLLIDs <- entrezmatched$entrezgene
    ent_uni <- rownames(oligos@assays$RNA)
    ent_uni_entrez <- bitr(ent_uni, fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)
         
    ent_uni <- ent_uni_entrez$ENTREZID
         
         
        
   ego <- enrichGO(gene  = as.character(allLLIDs),
                OrgDb = org.Hs.eg.db,
                ont = "BP",
                universe = ent_uni )
   
   write.csv(ego, here("outs", "Processed", "Oligodendroglia", "GO_Analysis", "Output_Data", "DE_Genes_orig_analysis", "data",
                       paste0("Oligo_", cluster, ".csv")))
                
}
```


```{r}
sessionInfo()
```